# read the COVID-19 case data and store column datum as class Date
covid <- read.csv("data/raw/COVID19Cases_geoRegion.csv") %>%
  mutate(datum = as.Date(datum))

# show the first rows of the data frame
head(covid)
geoRegion datum entries sumTotal timeframe_14d timeframe_all
1 CH 2020-02-24 1 1 FALSE TRUE
2 CH 2020-02-25 1 2 FALSE TRUE
3 CH 2020-02-26 10 12 FALSE TRUE
4 CH 2020-02-27 10 22 FALSE TRUE
5 CH 2020-02-28 10 32 FALSE TRUE
6 CH 2020-02-29 13 45 FALSE TRUE
offset_last7d sumTotal_last7d offset_last14d sumTotal_last14d offset_last28d
1 4385008 0 4383801 0 4376250
2 4385008 0 4383801 0 4376250
3 4385008 0 4383801 0 4376250
4 4385008 0 4383801 0 4376250
5 4385008 0 4383801 0 4376250
6 4385008 0 4383801 0 4376250
sumTotal_last28d sum7d sum14d mean7d mean14d entries_diff_last_age pop
1 0 NA NA NA NA 7 8738791
2 0 NA NA NA NA 7 8738791
3 0 NA NA NA NA 7 8738791
4 0 NA NA 8.14 NA 7 8738791
5 0 NA NA 12.29 NA 7 8738791
6 0 NA NA 16.86 NA 7 8738791
inz_entries inzsumTotal inzmean7d inzmean14d inzsumTotal_last7d
1 0.01 0.01 NA NA NA
2 0.01 0.02 NA NA NA
3 0.11 0.14 NA NA NA
4 0.11 0.25 0.09 NA NA
5 0.11 0.37 0.14 NA NA
6 0.15 0.51 0.19 NA NA
inzsumTotal_last14d inzsumTotal_last28d inzsum7d inzsum14d sumdelta7d
1 NA NA NA NA NA
2 NA NA NA NA NA
3 NA NA NA NA NA
4 NA NA NA NA NA
5 NA NA NA NA NA
6 NA NA NA NA NA
inzdelta7d type type_variant version datum_unit
1 NA COVID19Cases NA 2023-01-24_06-03-16 day
2 NA COVID19Cases NA 2023-01-24_06-03-16 day
3 NA COVID19Cases NA 2023-01-24_06-03-16 day
4 NA COVID19Cases NA 2023-01-24_06-03-16 day
5 NA COVID19Cases NA 2023-01-24_06-03-16 day
6 NA COVID19Cases NA 2023-01-24_06-03-16 day
entries_letzter_stand entries_neu_gemeldet entries_diff_last
1 1 0 914
2 1 0 914
3 10 0 914
4 10 0 914
5 10 0 914
6 13 0 914
# dimensions of the data frame covid: number of rows, number of columns
dim(covid)
[1] 30247 36
Data from the COVID-19 BAG dashboard: https://www.covid19.admin.ch/
dataframe setup: covid_cantons_2020
# filter data frame covid:
# only keep confirmed cases in the cantons of Zurich, Bern and Vaud
# in the first half of the year 2020 (dates up to and including 30 June 2020)
covid_cantons_2020 <- covid %>%
  filter(
    datum <= as.Date("2020-06-30"),
    # %in% tests membership in the set of canton codes in a single step
    geoRegion %in% c("ZH", "BE", "VD")
  )

# write data frame covid_cantons_2020 to a csv file
# NOTE(review): write.csv() also writes the row names as a first column by
# default; pass row.names = FALSE if that index column is not wanted
write.csv(
  x = covid_cantons_2020,
  file = "data/processed/covid_cantons_2020_06.csv"
)
geoRegion datum entries sumTotal timeframe_14d timeframe_all
1 BE 2020-02-24 0 0 FALSE TRUE
2 BE 2020-02-25 0 0 FALSE TRUE
3 BE 2020-02-26 0 0 FALSE TRUE
4 BE 2020-02-27 1 1 FALSE TRUE
5 BE 2020-02-28 0 1 FALSE TRUE
6 BE 2020-02-29 1 2 FALSE TRUE
offset_last7d sumTotal_last7d offset_last14d sumTotal_last14d offset_last28d
1 507985 0 507871 0 507046
2 507985 0 507871 0 507046
3 507985 0 507871 0 507046
4 507985 0 507871 0 507046
5 507985 0 507871 0 507046
6 507985 0 507871 0 507046
sumTotal_last28d sum7d sum14d mean7d mean14d entries_diff_last_age pop
1 0 NA NA NA NA 7 1047473
2 0 NA NA NA NA 7 1047473
3 0 NA NA NA NA 7 1047473
4 0 NA NA 0.29 NA 7 1047473
5 0 NA NA 0.86 NA 7 1047473
6 0 NA NA 1.29 NA 7 1047473
inz_entries inzsumTotal inzmean7d inzmean14d inzsumTotal_last7d
1 0.0 0.00 NA NA NA
2 0.0 0.00 NA NA NA
3 0.0 0.00 NA NA NA
4 0.1 0.10 0.03 NA NA
5 0.0 0.10 0.08 NA NA
6 0.1 0.19 0.12 NA NA
inzsumTotal_last14d inzsumTotal_last28d inzsum7d inzsum14d sumdelta7d
1 NA NA NA NA NA
2 NA NA NA NA NA
3 NA NA NA NA NA
4 NA NA NA NA NA
5 NA NA NA NA NA
6 NA NA NA NA NA
inzdelta7d type type_variant version datum_unit
1 NA COVID19Cases NA 2023-01-24_06-03-16 day
2 NA COVID19Cases NA 2023-01-24_06-03-16 day
3 NA COVID19Cases NA 2023-01-24_06-03-16 day
4 NA COVID19Cases NA 2023-01-24_06-03-16 day
5 NA COVID19Cases NA 2023-01-24_06-03-16 day
6 NA COVID19Cases NA 2023-01-24_06-03-16 day
entries_letzter_stand entries_neu_gemeldet entries_diff_last
1 0 0 75
2 0 0 75
3 0 0 75
4 1 0 75
5 0 0 75
6 1 0 75
Determine what variables you need to include in your dataframe to make the type of plot shown below.
Create a dataframe with the required variables and all data for 3 countries before 31 March 2015.
Exercise 4A: solution
# load library
library(dplyr)

# read the Ebola data, store column Date as class Date
# and sort the rows by date
data_ebola <- read.csv("data/raw/ebola.csv") %>%
  mutate(Date = as.Date(Date)) %>%
  arrange(Date)

# show the first rows of the sorted data frame
head(data_ebola)
# filter data_ebola: cumulative number of confirmed cases in Guinea,
# Liberia and Sierra Leone up to and including 31 March 2015
data_ebola_cum_cases <- data_ebola %>%
  # keep only the three required columns and give them lower-case names
  select(date = Date, country = Country, cum_conf_cases = Cum_conf_cases) %>%
  filter(
    date <= as.Date("2015-03-31"),
    # %in% tests membership in the set of countries in a single step
    country %in% c("Guinea", "Liberia", "Sierra Leone")
  )
Exercise 4B: basic plot
Create basic point, line and column plots of the cumulative number of confirmed cases versus time.
Exercise 4B: solution
# create point plot
plot_ebola_point_v0 <- ggplot(
  data_ebola_cum_cases,
  aes(x = date, y = cum_conf_cases)
) +
  geom_point()

# create line plot (one line per country)
plot_ebola_line_v0 <- ggplot(
  data_ebola_cum_cases,
  aes(x = date, y = cum_conf_cases)
) +
  geom_line(aes(group = country))

# create column plot (columns for the same date are stacked)
plot_ebola_col_v0 <- ggplot(
  data_ebola_cum_cases,
  aes(x = date, y = cum_conf_cases)
) +
  geom_col(position = "stack")
X age sex bmi children smoker region charges date
1 1 59 male 31.790 2 no southeast 13086.341 2001-01-15
2 2 24 female 22.600 0 no southwest 2574.268 2001-01-17
3 3 28 female 25.935 1 no northwest 4411.400 2001-01-22
4 4 22 male 25.175 0 no northwest 2321.417 2001-01-29
5 5 60 female 36.005 0 no northeast 13434.551 2001-02-06
6 6 38 female 28.000 3 no southwest 7262.940 2001-02-17
# dimensions of the data frame insurance: number of rows, number of columns
dim(insurance)
[1] 1338 9
Data adapted from “Machine Learning with R” by Brett Lantz.
Density plot / histogram
Exercise 5A: Can you reproduce these graphs using the insurance.csv dataset?